import numpy as np
import pandas as pd
from tqdm import tqdm
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
# Make "notebook" the default plotly renderer for this session.
pio.renderers.default = "notebook"
# Page categories of the click-stream data. Order matters: parse_dataset maps
# the 1-based ids found in the sequence file onto this list by position.
categories = [
    'frontpage',   # 1
    'news',        # 2
    'tech',        # 3
    'local',       # 4
    'opinion',     # 5
    'on-air',      # 6
    'misc',        # 7
    'weather',     # 8
    'msn-news',    # 9
    'health',      # 10
    'living',      # 11
    'business',    # 12
    'msn-sports',  # 13
    'sports',      # 14
    'summary',     # 15
    'bbs',         # 16
    'travel',      # 17
]
def parse_dataset(path='./data/msnbc/data.seq'):
    """
    Read a sequence file into one array of category names per line.

    Every line is split on whitespace and each token is interpreted as a
    1-based index into the module-level `categories` list. A line without
    tokens produces an empty session.

    Returns a 1-D object array whose elements are 1-D object arrays of
    category names, in file order.
    """
    with open(path) as f:
        sessions = [
            np.array(
                [categories[int(token) - 1] for token in line.split()],
                dtype=object,
            )
            for line in f
        ]

    # Fill a pre-allocated 1-D container rather than calling
    # np.array(sessions, dtype=object): when every session happens to have the
    # same length NumPy stacks them into a 2-D array, and callers would no
    # longer receive one array per user.
    result = np.empty(len(sessions), dtype=object)
    for position, session in enumerate(sessions):
        result[position] = session
    return result
def get_refresh_data(category, user_visits):
    """
    Measure back-to-back repeats ("refreshes") of `category` in one session.

    A run of k >= 2 consecutive hits counts as k - 1 refreshes, because the
    first hit of the run is the visit itself. The session is scanned once.

    Returns (total refreshes, longest run, shortest run), where run lengths
    are expressed in refreshes. All three are 0 when no run of at least two
    consecutive hits exists.
    """
    run_lengths = []  # refresh count of every finished run of >= 2 hits
    streak = 0        # consecutive hits of `category` seen so far

    for visit in user_visits:
        if visit == category:
            streak += 1
            continue
        # A different page ends the current streak.
        if streak >= 2:
            run_lengths.append(streak - 1)
        streak = 0

    # The session may end in the middle of a streak.
    if streak >= 2:
        run_lengths.append(streak - 1)

    if not run_lengths:
        return 0, 0, 0
    return sum(run_lengths), max(run_lengths), min(run_lengths)
def get_revisit_data(category, user_visits):
    """
    Measure returns ("revisits") to `category` after visiting other pages.

    Two consecutive hits of `category` form a revisit when at least one other
    page lies between them; the revisit distance is the number of pages in
    between. Adjacent hits are refreshes and are ignored here.

    Returns (number of revisits, longest distance, shortest distance), or
    (0, 0, 0) when the session contains no revisit.
    """
    hit_positions = [
        index for index, visit in enumerate(user_visits) if visit == category
    ]

    # Pages strictly between neighbouring hits; adjacent hits (difference of
    # one) are refreshes, not revisits, and are dropped.
    gaps = [
        later - earlier - 1
        for earlier, later in zip(hit_positions, hit_positions[1:])
        if later - earlier > 1
    ]

    if not gaps:
        return 0, 0, 0
    return len(gaps), max(gaps), min(gaps)
def get_dataset_stats(data):
    """
    Aggregate visit, refresh and revisit statistics for every category.

    data: iterable of per-user sessions; each session is compared against a
        category name element-wise (`session == category`), so it is expected
        to be a NumPy array of category names.

    Returns a dict mapping each entry of the module-level `categories` list
    to a dict with the ten counters initialised below. A "shortest" value of
    0 means that no refresh / revisit was observed at all.
    """
    categories_stats = {}
    for category in tqdm(categories):
        stats = {
            'visits': 0,            # 1 2 2 1 -> 1 and 2 are both visited 2 times
            'unique_visits': 0,     # 1 2 2 1 -> 1 and 2 are both visited by 1 user
            'refreshes': 0,         # 1 1 1 -> 1 is refreshed 2 times
            'unique_refreshes': 0,  # users with at least one refresh
            'revisits': 0,          # 1 2 2 1 -> 1 is revisited, 2 is refreshed
            'unique_revisits': 0,   # users with at least one revisit
            'longest_refresh': 0,   # 1 1 1 1 -> run of 3 refreshes (first hit is the visit)
            'shortest_refresh': 0,
            'longest_revisit': 0,   # 1 2 2 1 -> revisit distance of 2 pages for 1
            'shortest_revisit': 0,
        }
        categories_stats[category] = stats

        for user_visits in data:
            stats['visits'] += (user_visits == category).sum()
            if category in user_visits:
                stats['unique_visits'] += 1

            refreshes, longest_run, shortest_run = get_refresh_data(category, user_visits)
            stats['refreshes'] += refreshes
            if refreshes > 0:
                stats['unique_refreshes'] += 1
            stats['longest_refresh'] = max(stats['longest_refresh'], longest_run)
            # 0 doubles as "nothing recorded yet", both for this user and overall.
            if shortest_run != 0 and (
                stats['shortest_refresh'] == 0
                or shortest_run < stats['shortest_refresh']
            ):
                stats['shortest_refresh'] = shortest_run

            revisits, longest_gap, shortest_gap = get_revisit_data(category, user_visits)
            stats['revisits'] += revisits
            if revisits > 0:
                stats['unique_revisits'] += 1
            stats['longest_revisit'] = max(stats['longest_revisit'], longest_gap)
            if shortest_gap != 0 and (
                stats['shortest_revisit'] == 0
                or shortest_gap < stats['shortest_revisit']
            ):
                stats['shortest_revisit'] = shortest_gap

    return categories_stats
def plot_categories_statistics(categories, categories_stats):
    """
    Show one subplot per category with that category's statistics as bars.

    categories: sequence of category names; each must be a key of
        categories_stats. The names are also used as subplot titles.
    categories_stats: mapping category -> mapping statistic name -> number.

    Subplots are placed two per row, left to right and top to bottom.
    Nothing is returned; the figure is displayed with fig.show().
    """
    categories = list(categories)
    max_cols = 2
    # Size the grid from the input (ceiling division) instead of a fixed 9
    # rows, so more than 18 categories still find a cell. Keep at least one
    # row because the subplot grid cannot be empty.
    max_rows = max(1, -(-len(categories) // max_cols))

    fig = make_subplots(
        rows=max_rows, cols=max_cols,
        subplot_titles=categories)

    for index, category in enumerate(categories):
        stats = categories_stats[category]
        grid_row, grid_col = divmod(index, max_cols)
        fig.add_trace(
            go.Histogram(
                histfunc='sum',
                x=list(stats.keys()),
                y=list(stats.values()),
                name=category,
            ),
            # plotly grid coordinates are 1-based
            row=grid_row + 1, col=grid_col + 1
        )

    fig.update_layout(
        title_text='Categories statistics',
        bargap=0.1,
        height=max_rows * 300,
        width=max_cols * 650
    )
    fig.show()
def plot_statistics_categories(categories, categories_stats):
    """
    Show one subplot per statistic, comparing that statistic across categories.

    categories: sequence of category names; each must be a key of
        categories_stats.
    categories_stats: mapping category -> mapping statistic name -> number.
        The statistic names are taken from the first category's entry and
        are used as subplot titles.

    Subplots are stacked in a single column. Nothing is returned; the figure
    is displayed with fig.show().
    """
    categories = list(categories)
    max_cols = 1
    # Without any category there is no entry to read statistic names from.
    statistics = list(categories_stats[categories[0]]) if categories else []
    # Size the grid from the statistics actually present instead of a fixed
    # 10 rows; keep at least one row because the subplot grid cannot be empty.
    max_rows = max(1, -(-len(statistics) // max_cols))

    fig = make_subplots(
        rows=max_rows, cols=max_cols,
        subplot_titles=statistics)

    for index, statistic in enumerate(statistics):
        values = [categories_stats[category][statistic] for category in categories]
        grid_row, grid_col = divmod(index, max_cols)
        fig.add_trace(
            go.Histogram(histfunc='sum', x=categories, y=values, name=statistic),
            # plotly grid coordinates are 1-based
            row=grid_row + 1, col=grid_col + 1
        )

    fig.update_layout(
        title_text='Statistics categories',
        bargap=0.1,
        height=max_rows * 300,
        width=max_cols * 1200
    )
    fig.show()
# Load every user session from the sequence file, then display the first five
# sessions (the bare expression is what the notebook echoes as cell output).
data = parse_dataset('./data/msnbc/data.seq')
data[:5]
array([array(['frontpage', 'frontpage'], dtype=object),
array(['news'], dtype=object),
array(['tech', 'news', 'news', 'local', 'news', 'news', 'news', 'tech',
'tech'], dtype=object) ,
array(['opinion'], dtype=object),
array(['frontpage'], dtype=object)], dtype=object)
# Aggregate the per-category statistics over all sessions, then display the
# resulting dict (the bare expression is echoed as cell output).
dataset_stats = get_dataset_stats(data)
dataset_stats
100%|██████████| 17/17 [04:21<00:00, 15.38s/it]
{'frontpage': {'visits': 940469,
'unique_visits': 313181,
'refreshes': 526123,
'unique_refreshes': 183510,
'revisits': 208524,
'unique_revisits': 106246,
'longest_refresh': 14794,
'shortest_refresh': 1,
'longest_revisit': 1783,
'shortest_revisit': 1},
'news': {'visits': 452387,
'unique_visits': 175286,
'refreshes': 247557,
'unique_refreshes': 87452,
'revisits': 54761,
'unique_revisits': 32303,
'longest_refresh': 725,
'shortest_refresh': 1,
'longest_revisit': 583,
'shortest_revisit': 1},
'tech': {'visits': 207479,
'unique_visits': 121948,
'refreshes': 80048,
'unique_refreshes': 36169,
'revisits': 13543,
'unique_revisits': 9683,
'longest_refresh': 2057,
'shortest_refresh': 1,
'longest_revisit': 775,
'shortest_revisit': 1},
'local': {'visits': 386217,
'unique_visits': 121719,
'refreshes': 245416,
'unique_refreshes': 58429,
'revisits': 39607,
'unique_revisits': 22843,
'longest_refresh': 534,
'shortest_refresh': 1,
'longest_revisit': 1431,
'shortest_revisit': 1},
'opinion': {'visits': 151409,
'unique_visits': 24987,
'refreshes': 122160,
'unique_refreshes': 15213,
'revisits': 7435,
'unique_revisits': 4453,
'longest_refresh': 439,
'shortest_refresh': 1,
'longest_revisit': 1102,
'shortest_revisit': 1},
'on-air': {'visits': 414928,
'unique_visits': 217101,
'refreshes': 173252,
'unique_refreshes': 70929,
'revisits': 53845,
'unique_revisits': 33531,
'longest_refresh': 1257,
'shortest_refresh': 1,
'longest_revisit': 551,
'shortest_revisit': 1},
'misc': {'visits': 305615,
'unique_visits': 80514,
'refreshes': 197506,
'unique_refreshes': 58746,
'revisits': 47584,
'unique_revisits': 28909,
'longest_refresh': 124,
'shortest_refresh': 1,
'longest_revisit': 1431,
'shortest_revisit': 1},
'weather': {'visits': 439398,
'unique_visits': 95615,
'refreshes': 335018,
'unique_refreshes': 67143,
'revisits': 14548,
'unique_revisits': 9319,
'longest_refresh': 218,
'shortest_refresh': 1,
'longest_revisit': 1511,
'shortest_revisit': 1},
'msn-news': {'visits': 196614,
'unique_visits': 90192,
'refreshes': 95537,
'unique_refreshes': 43656,
'revisits': 25863,
'unique_revisits': 17718,
'longest_refresh': 289,
'shortest_refresh': 1,
'longest_revisit': 2325,
'shortest_revisit': 1},
'health': {'visits': 131760,
'unique_visits': 50606,
'refreshes': 74147,
'unique_refreshes': 21950,
'revisits': 13246,
'unique_revisits': 7933,
'longest_refresh': 96,
'shortest_refresh': 1,
'longest_revisit': 575,
'shortest_revisit': 1},
'living': {'visits': 96817,
'unique_visits': 57597,
'refreshes': 35703,
'unique_refreshes': 17398,
'revisits': 8680,
'unique_revisits': 6149,
'longest_refresh': 78,
'shortest_refresh': 1,
'longest_revisit': 776,
'shortest_revisit': 1},
'business': {'visits': 264899,
'unique_visits': 112183,
'refreshes': 141175,
'unique_refreshes': 46468,
'revisits': 22880,
'unique_revisits': 14312,
'longest_refresh': 893,
'shortest_refresh': 1,
'longest_revisit': 938,
'shortest_revisit': 1},
'msn-sports': {'visits': 216125,
'unique_visits': 76948,
'refreshes': 129522,
'unique_refreshes': 47516,
'revisits': 20169,
'unique_revisits': 14823,
'longest_refresh': 440,
'shortest_refresh': 1,
'longest_revisit': 2098,
'shortest_revisit': 1},
'sports': {'visits': 395880,
'unique_visits': 119138,
'refreshes': 263167,
'unique_refreshes': 72063,
'revisits': 24150,
'unique_revisits': 15525,
'longest_refresh': 310,
'shortest_refresh': 1,
'longest_revisit': 839,
'shortest_revisit': 1},
'summary': {'visits': 56576,
'unique_visits': 29200,
'refreshes': 22110,
'unique_refreshes': 9890,
'revisits': 13926,
'unique_revisits': 7078,
'longest_refresh': 824,
'shortest_refresh': 1,
'longest_revisit': 1822,
'shortest_revisit': 1},
'bbs': {'visits': 25249,
'unique_visits': 2082,
'refreshes': 22510,
'unique_refreshes': 1631,
'revisits': 1022,
'unique_revisits': 419,
'longest_refresh': 729,
'shortest_refresh': 1,
'longest_revisit': 1342,
'shortest_revisit': 1},
'travel': {'visits': 16972,
'unique_visits': 11006,
'refreshes': 5417,
'unique_refreshes': 2925,
'revisits': 1510,
'unique_revisits': 1048,
'longest_refresh': 19,
'shortest_refresh': 1,
'longest_revisit': 2299,
'shortest_revisit': 1}}
# Render both views of the statistics: first one subplot per category, then
# one subplot per statistic.
plot_categories_statistics(categories, dataset_stats)
plot_statistics_categories(categories, dataset_stats)